---
title: "Virtuous people and evil elites? The role of moralizing frames and normative distinctions in identifying populist discourse"
author: ""
date:  "`r format(Sys.time(), '%d %B, %Y')`"
output:
  pdf_document:
    # latex_engine is an option of the pdf_document format, so it has to be
    # nested below it; as a sibling key it is not applied to the PDF output.
    latex_engine: xelatex
geometry: margin = 1.2in
header-includes:
  - \usepackage{placeins}
  - \usepackage{chngcntr}
  - \usepackage{setspace}
  - \doublespacing
  - \usepackage[utf8]{inputenc}
  - \usepackage{geometry}
  - \usepackage{float}
  - \usepackage{wrapfig}
  - \usepackage{float}
  - \usepackage{booktabs}
  - \usepackage{multirow}
  - \usepackage{color,soul}
  - \usepackage{rotating, graphicx}
  - \usepackage{ifthen}
  - \usepackage{xcolor}
  - \usepackage{booktabs}
tables: true
numbersections: true

---

# load packages
  
```{r setup_moral, include=FALSE}
# Global chunk defaults for the rendered document: hide code, warnings and
# messages, disable caching and pin figures where they are defined.
knitr::opts_chunk$set(
  echo = FALSE,
  warning = FALSE,
  message = FALSE,
  # knitr expects a character prefix here; an empty string disables the prefix,
  # whereas FALSE would be coerced to text and printed in front of every line.
  comment = "",
  cache = FALSE,
  fig.pos = "H"
)

# This is a graphics-device setting, not a chunk option, so it is set on its
# own instead of being passed to opts_chunk$set().
pdf.options(useDingbats = TRUE)

p_needed <- c(
  "quanteda", "readtext", "readr", "readxl", "writexl", "lubridate",
  "kableExtra", "tidyverse", "caret"
)

# library() stops with an error when a package is missing; require() would
# only return FALSE and let the document fail later at an unrelated place.
invisible(lapply(p_needed, library, character.only = TRUE))

set.seed(301190)
```

```{r, echo=TRUE}

# Print the R version, platform and attached package versions into the
# rendered document (this chunk is shown with echo=TRUE).
sessionInfo()

```


# load data

```{r load_moral_data}

# Restore the saved workspaces from disk.
# NOTE(review): `parlspeech_text` is presumably one of the objects restored by
# these files -- confirm which file provides it.
load("parlspeech_merged.RData")
load("parlspeech_results.RData") # results are also rebuilt below; loading them is a shortcut

# Speech-level metadata: the same table without the (large) text column.
docvars_parl <- parlspeech_text
docvars_parl[["text"]] <- NULL
```


```{r load dicts, include = F}

# Run the dictionary definition scripts.
# NOTE(review): presumably these define base_dicts_people, base_dicts_elite,
# frame_dict_good, frame_dict_bad and other_dicts used below -- confirm.
invisible(lapply(c("dictionaries_moral.R", "other_dicts.R"), source))

```

# create measurement - table 4

```{r produce-snippets}

# Build keyword-in-context snippets around references to "the people" and
# "the elite" and count those references per speech.
#
# Reads:   parlspeech_text, docvars_parl, base_dicts_people, base_dicts_elite
# Creates: snippets_df_pep / snippets_df_eli   one row per snippet
#          snippets_final                      one row per doc_id (c_elite, c_people)
#          snippets_analysis_people / _elite   input for the frame dictionaries

# create short version where every speech appears just once
parlspeech_short <- parlspeech_text[!duplicated(parlspeech_text$doc_id), ]

# create the corpus
parlspeech_corpus <- corpus(parlspeech_short, docid_field = "doc_id", text_field = "text")

# Tokenise once and reuse the result for both dictionaries.
parlspeech_tokens <- tokens(parlspeech_corpus)

# TO DO: check average sentence length
snippets_pep <- kwic(parlspeech_tokens, pattern = base_dicts_people, window = 8)
snippets_eli <- kwic(parlspeech_tokens, pattern = base_dicts_elite, window = 8)

snippets_df_pep <- as.data.frame(snippets_pep)
snippets_df_eli <- as.data.frame(snippets_eli)

# get docvars back to the snippets data frames; doc_id is docname up to the
# first dot
snippets_df_pep$doc_id <- sub("\\..*", "", snippets_df_pep$docname)
snippets_df_eli$doc_id <- sub("\\..*", "", snippets_df_eli$docname)

# NOTE(review): docvars_parl is not de-duplicated on doc_id (unlike
# parlspeech_short), so a doc_id that repeats there duplicates its snippets in
# these joins and inflates the counts below -- confirm this is intended.
snippets_df_pep <- left_join(snippets_df_pep, docvars_parl, by = "doc_id")
snippets_df_pep$people <- 1
snippets_df_pep$elite <- 0
snippets_df_eli <- left_join(snippets_df_eli, docvars_parl, by = "doc_id")
snippets_df_eli$people <- 0
snippets_df_eli$elite <- 1

# make one text variable
snippets_df_pep$text <- paste(snippets_df_pep$pre, snippets_df_pep$keyword, snippets_df_pep$post)
snippets_df_eli$text <- paste(snippets_df_eli$pre, snippets_df_eli$keyword, snippets_df_eli$post)

# drop hits where the keyword is part of an unrelated expression, e.g.
# "establishment of the court" or "freedom of establishment"
false_hits_eli <- "freedom of establishment|establishment of"
false_hits_pep <- "people's party|fair trade|people's republic"
snippets_df_eli <- snippets_df_eli[!grepl(false_hits_eli, snippets_df_eli$text, ignore.case = TRUE), ]
snippets_df_pep <- snippets_df_pep[!grepl(false_hits_pep, snippets_df_pep$text, ignore.case = TRUE), ]

# make a new document id that is unique (row names survive the filtering above)
snippets_df_pep <- mutate(snippets_df_pep, document = rownames(snippets_df_pep))
snippets_df_eli <- mutate(snippets_df_eli, document = rownames(snippets_df_eli))

# stack both snippet sets and aggregate the reference counts to speech level
snippets_df_all <- rbind.data.frame(snippets_df_eli, snippets_df_pep)
snippets_final <- snippets_df_all %>%
  dplyr::select(docname, elite, people, from, to, doc_id) %>%
  dplyr::group_by(doc_id) %>%
  dplyr::summarise(
    c_elite = sum(elite),
    c_people = sum(people)
  )

# prepare data for the moral dictionaries: doc_id_new = "<doc_id>.<snippet id>"
snippets_analysis_elite <- snippets_df_eli %>%
  dplyr::mutate(doc_id_new = paste0(doc_id, ".", document)) %>%
  dplyr::select(docname, from, to, keyword, doc_id, elite, people, text, doc_id_new)

snippets_analysis_people <- snippets_df_pep %>%
  dplyr::mutate(doc_id_new = paste0(doc_id, ".", document)) %>%
  dplyr::select(docname, from, to, keyword, doc_id, elite, people, text, doc_id_new)



```


```{r apply_dicts}
#___________________________________________________________________________
##### apply moral dictionaries on snippets
#____________________________________________________________________________
# Reads:   snippets_analysis_people, snippets_analysis_elite,
#          frame_dict_good, frame_dict_bad
# Creates: frame_dict_final, one row per doc_id with the summed `good` hits in
#          people snippets and `bad` hits in elite snippets.
# NOTE(review): assumes frame_dict_good has the single key "good" and
# frame_dict_bad the single key "bad" -- confirm in dictionaries_moral.R.

people_corp <- corpus(snippets_analysis_people, docid_field = "doc_id_new", text_field = "text")
elite_corp <- corpus(snippets_analysis_elite, docid_field = "doc_id_new", text_field = "text")

# tokens() -> tokens_lookup() -> dfm() is the supported pipeline; passing a
# corpus and `dictionary =` straight to dfm() is deprecated in quanteda.
people_dic_appl <- dfm(
  tokens_lookup(tokens(people_corp), dictionary = frame_dict_good),
  tolower = TRUE
)
elite_dic_appl <- dfm(
  tokens_lookup(tokens(elite_corp), dictionary = frame_dict_bad),
  tolower = TRUE
)

# convert to data frame
people_dict_df <- convert(people_dic_appl, to = "data.frame")
elite_dict_df <- convert(elite_dic_appl, to = "data.frame")

# The document-id column always comes first, but its name depends on the
# quanteda version ("document" in older releases, "doc_id" in newer ones).
names(people_dict_df)[1] <- "document"
names(elite_dict_df)[1] <- "document"

# people snippets carry no "bad" score and elite snippets no "good" score
people_dict_df$bad <- 0
elite_dict_df$good <- 0

# reconstruct doc_id: strip the ".<snippet id>" suffix added in doc_id_new
elite_dict_df$document <- sub("\\..*", "", elite_dict_df$document)
people_dict_df$document <- sub("\\..*", "", people_dict_df$document)

# bind the two frames
frame_dict <- rbind.data.frame(people_dict_df, elite_dict_df)

# aggregate up to speech level
frame_dict_final <- frame_dict %>%
  dplyr::mutate(doc_id = document) %>%
  dplyr::group_by(doc_id) %>%
  dplyr::summarise(
    bad = sum(bad),
    good = sum(good)
  )

```


```{r apply_other_dicts}

# Apply the comparison dictionaries to the full speeches.
# Reads:   parlspeech_corpus, other_dicts
# Creates: other_dict_df with `doc_id` plus one count column per dictionary key.

# tokens() -> tokens_lookup() -> dfm() is the supported pipeline; passing a
# corpus and `dictionary =` straight to dfm() is deprecated in quanteda.
other_dict <- dfm(
  tokens_lookup(tokens(parlspeech_corpus), dictionary = other_dicts),
  tolower = TRUE
)

# convert to data frame
other_dict_df <- convert(other_dict, to = "data.frame")

# The document-id column always comes first, but its name depends on the
# quanteda version, so it is renamed by position to the join key used below.
names(other_dict_df)[1] <- "doc_id"

```

```{r combine-measures}
#### bring all dictionary results together on speech level
# base dicts  -> snippets_final   (c_elite, c_people)
# frame dicts -> frame_dict_final (bad, good)
# other dicts -> other_dict_df
# Creates: speech_lev_results = parlspeech_text plus all counts and the 0/1
#          indicators d_bad, d_good, d_pauwels, d_gidron.

# join keys are spelled out so an accidental shared column cannot silently
# become part of the key
frame_base <- full_join(snippets_final, frame_dict_final, by = "doc_id")
frame_base_other <- full_join(frame_base, other_dict_df, by = "doc_id")

speech_lev_results <- left_join(parlspeech_text, frame_base_other, by = "doc_id")

# get rid of unnecessary variables; no_speech = 1 lets later chunks count
# speeches by summing
speech_lev_results <- speech_lev_results %>%
  select(-document_id) %>%
  mutate(no_speech = 1)

# speeches without any snippet have no row in the base/frame results -> 0 hits
speech_lev_results$c_elite[is.na(speech_lev_results$c_elite)] <- 0
speech_lev_results$c_people[is.na(speech_lev_results$c_people)] <- 0
speech_lev_results$bad[is.na(speech_lev_results$bad)] <- 0
speech_lev_results$good[is.na(speech_lev_results$good)] <- 0

# dummy variables: does a speech contain populist elements at all?
speech_lev_results <- speech_lev_results %>%
  mutate(
    d_bad = case_when(bad > 0 ~ 1, bad == 0 ~ 0),
    d_good = case_when(good > 0 ~ 1, good == 0 ~ 0),
    d_pauwels = case_when(pauwels > 0 ~ 1, pauwels == 0 ~ 0),
    d_gidron = case_when(gidron > 0 ~ 1, gidron == 0 ~ 0)
  )


```


```{r}

# Validate the dictionary classifications against four hand-coded samples and
# print the classification-accuracy table.
# Reads:   speech_lev_results and the four coded .xlsx samples
# Creates: sample_full_merged, matrix_* (caret confusion matrices),
#          table_validation (also saved to table_validation.RData)

# One row per speech, restricted to the measures needed for validation.
speech_lev_results_valid <- speech_lev_results[!duplicated(speech_lev_results$doc_id), ] %>%
  select(
    doc_id, c_elite, c_people, bad,
    good, pauwels, gidron, d_bad, d_good, d_pauwels, d_gidron
  )

# Read the coded samples, keep the variables needed and give all of them the
# same columns so they can be stacked. `sample` distinguishes the samples.

# 1) sample frame people: the hand coding is the people-centrism code
sample_people <- read_excel("sample_framepeople_coded.xlsx") %>%
  dplyr::select(doc_id, text, peoplecentr, sample)
sample_people$handcoding <- sample_people$peoplecentr
sample_people$antielite <- NA

# 2) sample frame elite: the hand coding is the anti-elitism code
sample_elite <- read_excel("sample_frameelite_coded.xlsx") %>%
  dplyr::select(doc_id, text, antielite, sample)
sample_elite$handcoding <- sample_elite$antielite
sample_elite$peoplecentr <- NA

# 3) sample false negatives: already has an overall hand coding
sample_falseneg <- read_excel("sample_falseneg_coded.xlsx") %>%
  dplyr::select(doc_id, text, handcoding, sample)
sample_falseneg$peoplecentr <- NA
sample_falseneg$antielite <- NA

# 4) sample random: populist if either dimension was coded non-zero
sample_random <- read_excel("sample_gold_coded.xlsx") %>%
  dplyr::select(doc_id, text, peoplecentr, antielite, sample)
sample_random$handcoding <- if_else(
  sample_random$antielite != 0 | sample_random$peoplecentr != 0, 1, 0
)

# append different samples
sample_full <- rbind.data.frame(sample_random, sample_falseneg, sample_people, sample_elite)

# Join against the de-duplicated table so that each coded speech is matched to
# exactly one row of dictionary results.
sample_full_merged <- left_join(sample_full, speech_lev_results_valid, by = "doc_id")

# dictionary-based classifications
sample_full_merged$mydict <- if_else(
  sample_full_merged$d_good == 1 | sample_full_merged$d_bad == 1, 1, 0
)
sample_full_merged$d_refs <- if_else(
  sample_full_merged$c_elite != 0 | sample_full_merged$c_people != 0, 1, 0
)

# Fix the levels to 0/1 so a measure that never (or always) fires in the
# sample still shares its levels with the hand coding.
binary_vars <- c("mydict", "handcoding", "d_gidron", "d_pauwels", "d_refs")
sample_full_merged[binary_vars] <- lapply(
  sample_full_merged[binary_vars], factor,
  levels = c(0, 1)
)

# caret computes sensitivity/specificity relative to `reference`, so the hand
# coding (gold standard) is the reference and the dictionary output is `data`.
confusion_vs_handcoding <- function(predicted) {
  caret::confusionMatrix(
    data = predicted,
    reference = sample_full_merged$handcoding,
    positive = "1"
  )
}

matrix_mydict <- confusion_vs_handcoding(sample_full_merged$mydict)
matrix_gidron <- confusion_vs_handcoding(sample_full_merged$d_gidron)
matrix_pauwels <- confusion_vs_handcoding(sample_full_merged$d_pauwels)
matrix_refs <- confusion_vs_handcoding(sample_full_merged$d_refs)

# bind the per-class statistics of all measures, then append overall accuracy
table_validation <- cbind(
  matrix_mydict$byClass, matrix_gidron$byClass,
  matrix_pauwels$byClass, matrix_refs$byClass
)
colnames(table_validation) <- c(
  "My approach", "Bonikowski & Gidron", "Rooduijn & Pauwels", "References to groups"
)
table_validation <- rbind(
  table_validation,
  cbind(
    matrix_mydict$overall[["Accuracy"]], matrix_gidron$overall[["Accuracy"]],
    matrix_pauwels$overall[["Accuracy"]], matrix_refs$overall[["Accuracy"]]
  )
)
rownames(table_validation)[nrow(table_validation)] <- "Overall Accuracy"
save(table_validation, file = "table_validation.RData")

# select only certain measures for the main text
table_validation_short <- table_validation[
  c("Sensitivity", "Specificity", "Balanced Accuracy", "Overall Accuracy"),
]

knitr::kable(table_validation_short,
  digits = 2, booktabs = TRUE,
  caption = "\\label{tab:validation}Classification Accuracy", longtable = TRUE
) %>%
  kable_styling(latex_options = "striped") %>%
  row_spec(row = 0, bold = TRUE) %>%
  column_spec(2:5, width = "2cm")

```

```{r aggregate_full_level1}

# Total number of "good" and "bad" dictionary hits, split by the Populist flag.
sumup <- dplyr::summarise(
  dplyr::group_by(speech_lev_results, Populist),
  good = sum(good),
  bad = sum(bad)
)

```

```{r aggregate_party_level2}

# Party-level summary: number of speeches flagged by each measure and the
# corresponding share of the party's speeches.
party_totals <- speech_lev_results %>%
  dplyr::group_by(party_name.x) %>%
  dplyr::summarise(
    good = sum(d_good),
    bad = sum(d_bad),
    no_speech = sum(no_speech),
    country = first(country),
    Populist = first(Populist),
    pauwels = sum(d_pauwels),
    gidron = sum(d_gidron)
  )

party_level <- dplyr::mutate(
  party_totals,
  per_good = good / no_speech,
  per_bad = bad / no_speech,
  per_pauwels = pauwels / no_speech,
  per_gidron = gidron / no_speech
)

```


# Table 5


```{r}

# Table 5: number and share of speeches classified as populist by each
# dictionary, split by whether the speaker's party is populist.
# Reads: speech_lev_results (Populist, d_bad, d_good, d_gidron, d_pauwels)

# counts of flagged speeches per group
agg1 <- speech_lev_results %>%
  mutate(speech_count = 1) %>%
  dplyr::group_by(Populist) %>%
  dplyr::summarise(
    speech_count = sum(speech_count),
    bad = sum(d_bad),
    good = sum(d_good),
    gidron = sum(d_gidron),
    pauwels = sum(d_pauwels)
  ) %>%
  select(bad, good, gidron, pauwels, Populist, speech_count) %>%
  round(digits = 0) %>%
  mutate(measure = "count")

# the same figures as a share of all speeches in the group
agg2 <- agg1 %>%
  mutate(
    bad = bad / speech_count,
    good = good / speech_count,
    gidron = gidron / speech_count,
    pauwels = pauwels / speech_count
  ) %>%
  select(bad, good, gidron, pauwels, Populist, speech_count) %>%
  round(digits = 3) %>%
  mutate(measure = "percent")

# stack counts and shares; within each group the count row precedes the share
agg <- rbind.data.frame(agg1, agg2)
agg <- agg[order(agg$Populist), ]
agg <- agg %>% select(Populist, speech_count, measure, bad, good, gidron, pauwels)

agg$Populist[agg$Populist == 0] <- "No"
agg$Populist[agg$Populist == 1] <- "Yes"

# NOTE(review): combined = bad + good counts a speech twice when it is flagged
# for both anti-elitism and people-centrism -- confirm this is intended.
agg3 <- agg %>%
  select(bad, good, gidron, pauwels) %>%
  mutate(combined = bad + good) %>%
  select(combined, gidron, pauwels)
agg4 <- agg %>% select(Populist, measure)

# Counts are printed as integers and shares with three decimals; the format is
# chosen from each row's `measure` rather than from its position.
agg_df <- t(vapply(
  seq_len(nrow(agg3)),
  function(i) {
    row_values <- as.numeric(agg3[i, ])
    if (agg$measure[i] == "count") {
      formatC(row_values, format = "d")
    } else {
      formatC(row_values, format = "f", digits = 3)
    }
  },
  character(ncol(agg3))
))
colnames(agg_df) <- c("combined", "gidron", "pauwels")

# label shares as proportions in the printed table
agg5 <- cbind.data.frame(agg4, agg_df) %>%
  select(Populist, measure, combined, gidron, pauwels) %>%
  mutate(measure = if_else(measure == "percent", "proportion", "count"))

# create final table
knitr::kable(agg5,
  booktabs = TRUE,
  caption = "\\label{sharepopulism}Classification of speeches for different dictionaries",
  longtable = TRUE,
  col.names = c(
    "Populist party",
    "",
    "My approach",
    "Bonikowski & Gidron",
    "Rooduijn & Pauwels"
  )
) %>%
  kable_styling() %>%
  row_spec(row = 0, bold = TRUE) %>%
  column_spec(2:5, width = "2cm") %>%
  column_spec(2, bold = FALSE, border_right = TRUE)
  

```



\begin{table}[h!]
\caption{Cross tabulation of anti-elitism and people-centrism being present in the same speech}
\centering
\label{twobytwo}
\begin{tabular}{l|ll}
\hline
anti-elitism / people-centrism & no & yes \\ \cline{2-3} 
no & 174195 & 4055 \\
yes & 594 & 61 \\ \hline
\end{tabular}%
\end{table}


# Figure 1

```{r scattermep, fig.align='center', fig.show="hold", fig.cap="\\label{scattermep}Scatter plot for percentage of people-centrism and anti-elitism in MEPs' speeches", fig.pos="!htp", fig.width=8}


## aggregate to MEP level
# One row per MEP: number of speeches, number of speeches flagged by each
# measure, the MEP's metadata and the share of flagged speeches.
mep <- speech_lev_results %>%
  mutate(speech_count = 1) %>%
  dplyr::group_by(mep_ids) %>%
  dplyr::summarise(
    speech_count = sum(speech_count),
    bad = sum(d_bad),
    good = sum(d_good),
    gidron = sum(d_gidron),
    pauwels = sum(d_pauwels),
    # summarise() evaluates its arguments in order and later ones see columns
    # created earlier, so the long label must be taken before `family_name`
    # is replaced by the short name.
    family_label = first(family_name),
    family_name = first(family_name_short),
    surname.x = first(surname.x),
    firstname.x = first(firstname.x),
    country = first(country),
    party_id = first(party_id),
    party_name_english = first(party_name_english),
    Populist = first(Populist)
  ) %>%
  mutate(
    bad_per = bad / speech_count,
    good_per = good / speech_count,
    gidron_per = gidron / speech_count,
    pauwels_per = pauwels / speech_count,
    count_both = bad + good,
    both_per = count_both / speech_count
  )

# MEPs inside the plotted range (same thresholds as the axis limits below)
mep_small <- mep %>%
  filter(bad_per < 0.1) %>%
  filter(good_per < 0.2)
mep_small$Populist <- as.factor(mep_small$Populist)
mep$Populist <- as.factor(mep$Populist)

# Scatter plot of people-centrism against anti-elitism with one linear fit per
# group; the axis limits drop MEPs outside the range from points and fits.
ggplot() +
  geom_point(
    data = mep,
    mapping = aes(x = good_per, y = bad_per, color = Populist),
    position = "jitter"
  ) +
  geom_smooth(
    data = mep,
    mapping = aes(x = good_per, y = bad_per, color = Populist),
    method = "lm", se = FALSE, fullrange = TRUE
  ) +
  xlim(0, 0.2) +
  ylim(0, 0.1) +
  labs(x = "Percentage of positive references to the people") +
  labs(y = "Percentage of negative references to the elite") +
  scale_color_manual(
    values = c("grey", "black"), name = "MEP classified as",
    labels = c("non-populist", "populist")
  ) +
  theme_minimal()
  
```


# Figure 2

```{r boxplotsNEW, fig.align='center', fig.show="hold", fig.cap="\\label{boxplots}Box plots for parties by left-right dimension", fig.pos="!htp", fig.width=8, warning = FALSE, message= FALSE}


# Share of speeches flagged by each measure, by rounded left-right position.
leftrightdata <- speech_lev_results %>%
  mutate(
    lr = round(left_right, digits = 0),
    speech_count = 1
  ) %>%
  group_by(lr) %>%
  dplyr::summarise(
    speech_count = sum(speech_count),
    bad = sum(d_bad),
    good = sum(d_good),
    gidron = sum(d_gidron),
    pauwels = sum(d_pauwels)
  ) %>%
  mutate(
    bad_per = bad / speech_count,
    good_per = good / speech_count,
    gidron_per = gidron / speech_count,
    pauwels_per = pauwels / speech_count,
    count_both = bad + good,
    both_per = count_both / speech_count
  )

leftrightdata$lr <- as.factor(leftrightdata$lr)

# Bar chart of one share column of `leftrightdata` against the left-right
# position; all five panels share this layout and differ only in column/title.
lr_barplot <- function(share_col, panel_title) {
  ggplot(leftrightdata) +
    geom_bar(mapping = aes(x = lr, y = .data[[share_col]]), stat = "identity") +
    theme_minimal() +
    ggtitle(panel_title) +
    scale_y_continuous(name = "percent") +
    scale_x_discrete(name = "left-right dimension") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot1 <- lr_barplot("bad_per", "Anti-elitism")
plot2 <- lr_barplot("good_per", "People-centrism")
plot3 <- lr_barplot("both_per", "Both core features")
plot4 <- lr_barplot("gidron_per", "Bonikowski&Gidron")
plot5 <- lr_barplot("pauwels_per", "Rooduijn&Pauwels")

# arrange the five panels in two rows with automatic A-E labels
plotall <- cowplot::plot_grid(plot1, plot2, plot3, plot4, plot5,
  labels = "AUTO", nrow = 2, align = "v"
)

plotall

  
```

# Appendix

### Number of hits of base dictionaries

```{r load_data_moral_app}

# Restore previously saved snippet tables and the validation table from disk.
# NOTE(review): presumably each file holds an object named like the file; if
# so, loading replaces the in-memory versions built in the chunks above --
# confirm the saved files are in sync with the current code.
load("snippets_df_eli.RData")
load("snippets_df_pep.RData")
load("table_validation.RData")

```


```{r app_snippets_create}


# Appendix table: how often each keyword of the people and elite base
# dictionaries was matched.
# Reads: snippets_df_pep$keyword, snippets_df_eli$keyword

# Count matches per keyword (lower-cased, apostrophes removed), most frequent
# first, with a leading "total" row. Counts are returned as text so that blank
# padding rows can be appended later.
count_keywords <- function(keywords) {
  cleaned <- str_remove_all(tolower(keywords), fixed("'"))
  per_keyword <- data.frame(keyword = cleaned, stringsAsFactors = FALSE) %>%
    dplyr::count(keyword, name = "count") %>%
    dplyr::arrange(dplyr::desc(count))
  total <- data.frame(
    keyword = "total", count = sum(per_keyword$count),
    stringsAsFactors = FALSE
  )
  dplyr::bind_rows(total, per_keyword) %>%
    dplyr::mutate(count = formatC(count, format = "d"))
}

# Append blank rows until `df` has `n` rows, so tables of different length can
# be placed side by side.
pad_rows <- function(df, n) {
  n_missing <- n - nrow(df)
  if (n_missing <= 0) {
    return(df)
  }
  filler <- as.data.frame(
    matrix("", nrow = n_missing, ncol = ncol(df), dimnames = list(NULL, names(df))),
    stringsAsFactors = FALSE
  )
  dplyr::bind_rows(df, filler)
}

tab_snippets_p <- count_keywords(snippets_df_pep$keyword)

# The elite dictionary is cut to the first 27 rows (total row + 26 keywords).
# NOTE(review): the table footnote speaks of the 25 most frequent keywords --
# confirm which number is intended.
tab_snippets_e <- utils::head(count_keywords(snippets_df_eli$keyword), 27)

# pad the shorter table instead of assuming a fixed length difference
n_rows <- max(nrow(tab_snippets_p), nrow(tab_snippets_e))
tab_snippets_p <- pad_rows(tab_snippets_p, n_rows)
tab_snippets_e <- pad_rows(tab_snippets_e, n_rows)
names(tab_snippets_p) <- c("keyword_pep", "count_pep")
names(tab_snippets_e) <- c("keyword_eli", "count_eli")

# people keywords on the left, elite keywords on the right
snippets_count <- cbind.data.frame(tab_snippets_p, tab_snippets_e)

# make kable table; header and "total" row are bold
knitr::kable(snippets_count,
  digits = 2, booktabs = TRUE,
  caption = "\\label{count_basedicts}Frequency of keywords for people and elite dictionaries",
  longtable = TRUE,
  col.names = c("People", "Count", "Elite", "Count"),
  linesep = ""
) %>%
  kable_styling(latex_options = "striped") %>%
  footnote(general = "For the elite dictionary, only the 25 most-occuring keyword are depicted.") %>%
  row_spec(row = 0:1, bold = TRUE)


```

\newpage


### Example for snippets

```{r example-snippets}

# Example table: ten randomly drawn people snippets plus two hand-picked ones.
random_examples <- sample_n(select(snippets_df_pep, pre, keyword, post), 10)

# hand-picked snippets, each given as pre / keyword / post
ex2 <- c("able to defend the interests of", "hard-working", "Dutch taxpayers yesterday .")
ex3 <- c(". You are elected by the people to", "defend", "democracy . What do the selected")

tab_ex <- do.call(rbind.data.frame, list(random_examples, ex2, ex3))

snippet_table <- knitr::kable(tab_ex,
  digits = 2, booktabs = T,
  caption = "\\label{snippets}Example for snippets",
  longtable = F,
  col.names = c("Pre", "Keyword", "Post"),
  linesep = ""
)
snippet_table <- kable_styling(snippet_table, latex_options = c("striped", "hold_position"))
snippet_table <- row_spec(snippet_table, row = 0, bold = T)
snippet_table <- column_spec(snippet_table, 1, width = "6cm")
snippet_table <- column_spec(snippet_table, 3, width = "6cm")
column_spec(snippet_table, column = 2, bold = T)



```

\newpage

### Accuracy of measurement

```{r}
# Full validation table (all per-class statistics plus overall accuracy).
validation_tbl <- knitr::kable(table_validation,
  digits = 2, booktabs = T,
  caption = "\\label{tab:validation2}Classification Accuracy", longtable = T
)
validation_tbl <- kable_styling(validation_tbl, latex_options = "striped")
validation_tbl <- row_spec(validation_tbl, row = 0, bold = T)
column_spec(validation_tbl, 2:5, width = "2cm")
```






